* teach mkshort to split utf8 on grapheme boundaries.
and retire some cet functions.
* add the grapheme test.
* wean gbfile from cet.
* restore cet.cc, cet.h in preparation for deprecation.
* retire cet.cc, cet.h
* drop attempted support of \r line endings in gbfgetutf16str.
* add test of surrogate pairs for gbfgetutf16str.
* fix test
set(SUPPORT
route.cc waypt.cc filter_vecs.cc util.cc vecs.cc mkshort.cc
csv_util.cc strptime.c grtcirc.cc util_crc.cc xmlgeneric.cc
- formspec.cc xmltag.cc cet.cc cet_util.cc fatal.cc rgbcolors.cc
+ formspec.cc xmltag.cc cet_util.cc fatal.cc rgbcolors.cc
inifile.cc garmin_fs.cc units.cc gbser.cc
gbfile.cc parse.cc session.cc main.cc globals.cc
src/core/nvector.cc
endif()
set(HEADERS
- cet.h
cet_util.h
csv_util.h
defs.h
SUPPORT = route.cc waypt.cc filter_vecs.cc util.cc vecs.cc mkshort.cc \
csv_util.cc strptime.c grtcirc.cc util_crc.cc xmlgeneric.cc \
- formspec.cc xmltag.cc cet.cc cet_util.cc fatal.cc rgbcolors.cc \
+ formspec.cc xmltag.cc cet_util.cc fatal.cc rgbcolors.cc \
inifile.cc garmin_fs.cc units.cc gbser.cc \
gbfile.cc parse.cc session.cc main.cc globals.cc \
src/core/nvector.cc \
versionAtLeast(QT_VERSION, 6.0): SUPPORT += src/core/codecdevice.cc
HEADERS = \
- cet.h \
cet_util.h \
csv_util.h \
defs.h \
+++ /dev/null
-/*
-
- Character encoding transformation - basics
-
- Copyright (C) 2005-2008 Olaf Klein, o.b.klein@gpsbabel.org
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-*/
-
-#include <cstring> // for strlen
-
-#include "defs.h"
-#include "cet.h"
-
-/* ! ALL vec PARAMETERS HAVE TO BE A VALID POINTER TO A cet_cs_vec_t RECORD ! */
-
-/* =========================================================================== */
-/* %%% single character or value transmission %%% */
-/* --------------------------------------------------------------------------- */
-
-/* %%% cet_ucs4_to_utf8 %%%
- *
- * convert single UCS-4 value into UTF-8 sequence
- *
- * return values: >= 0: length of produced UTF-8 sequence
- * < 0: -bytes more needed in target space
- */
-
-int
-cet_ucs4_to_utf8(char* dest, size_t dest_size, int value)
-{
- int result;
- unsigned char trash[16];
-
- unsigned char* c = (dest != nullptr) ? (unsigned char*) dest : trash;
-
- if ((value & 0xffffff80) == 0) { /* <= 7 bits */
- if (dest_size < 1) {
- return (dest_size - 1);
- }
- *c++ = value;
- result = 1;
- } else if ((value & 0xfffff800) == 0) { /* <= 11 bits */
- if (dest_size < 2) {
- return (dest_size - 2);
- }
- *c++ = (0xc0 | (value >> 6));
- *c++ = (0x80 | (value & 0x3f));
- result = 2;
-
- } else if ((value & 0xffff0000) == 0) { /* <= 16 bits */
- if (dest_size < 3) {
- return (dest_size - 3);
- }
- *c++ = (0xe0 | (value >> 12));
- *c++ = (0x80 | ((value >> 6) & 0x3f));
- *c++ = (0x80 | (value & 0x3f));
- result = 3;
- } else if ((value & 0xffe00000) == 0) { /* <= 21 bits */
- if (dest_size < 4) {
- return (dest_size - 4);
- }
- *c++ = (0xf0 | (value >> 18));
- *c++ = (0x80 | ((value >> 12) & 0x3f));
- *c++ = (0x80 | ((value >> 6) & 0x3f));
- *c++ = (0x80 | (value & 0x3f));
- result = 4;
- } else if ((value & 0xfc000000) == 0) { /* <= 26 bits */
- if (dest_size < 5) {
- return (dest_size - 5);
- }
- *c++ = (0xf8 | (value >> 24));
- *c++ = (0x80 | ((value >> 18) & 0x3f));
- *c++ = (0x80 | ((value >> 12) & 0x3f));
- *c++ = (0x80 | ((value >> 6) & 0x3f));
- *c++ = (0x80 | (value & 0x3f));
- result = 5;
- } else if ((value & 0x80000000) == 0) { /* <= 31 bits */
- if (dest_size < 6) {
- return (dest_size - 6);
- }
- *c++ = (0xfc | (value >> 30));
- *c++ = (0x80 | ((value >> 24) & 0x3f));
- *c++ = (0x80 | ((value >> 18) & 0x3f));
- *c++ = (0x80 | ((value >> 12) & 0x3f));
- *c++ = (0x80 | ((value >> 6) & 0x3f));
- *c++ = (0x80 | (value & 0x3f));
- result = 6;
- } else {
- return 0; /* Value = -1 */
- }
- return result;
-}
-
-/* %%% cet_utf8_to_ucs4 %%%
- *
- * decode single UTF-8 sequence into UCS-4 value
- *
- * return values: 0 if success, otherwise 1
- */
-int
-cet_utf8_to_ucs4(const char* str, int* bytes, int* value)
-{
- auto* cp = (unsigned char*)str;
-
- if (*cp < 0x80) {
- if (bytes != nullptr) {
- *bytes = 1;
- }
- if (value != nullptr) {
- *value = *cp;
- }
- return CET_SUCCESS;
- } else {
- unsigned char bits = 0xc0;
- unsigned char mask = 0xe0;
-
- for (int len = 1; len <= 6; len++) { /* outer loop, test UTF-8 frame */
- if ((*cp & mask) == bits) {
- int i = len;
- while (i-- > 0) {
- cp++;
- if ((*cp & 0xc0) != 0x80) {
- break; /* invalid */
- } else if (i == 0) { /* all valid */
- const char* c = str; /* found valid sequence, now storing value */
- int res = *c++ & (mask ^ 0xFF);
- i = len;
- while (i-- > 0) {
- res = (res << 6) | (*c++ & 0x3f);
- }
-
- if (bytes != nullptr) {
- *bytes = len + 1;
- }
- if (value != nullptr) {
- *value = res;
- }
- return CET_SUCCESS;
- }
- }
- }
- bits = (bits >> 1) | 0x80;
- mask = (mask >> 1) | 0x80;
- }
- }
- if (bytes != nullptr) {
- *bytes = 1;
- }
- if (value != nullptr) {
- *value = *cp;
- }
- return CET_ERROR; /* not valid */
-}
-
-/* =========================================================================== */
-/* %%% UTF-8 string manipulation functions %%% */
-/* =========================================================================== */
-
-/* %%% cet_utf8_strlen %%%
- *
- * Returns the number of valid (visible) characters.
- */
-unsigned int
-cet_utf8_strlen(const char* str)
-{
- if (str) {
- const char* cin = str;
- int len = 0;
-
- while (*cin) {
- int bytes, value;
- if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
- len++;
- }
- cin += bytes;
- }
- return len;
- } else {
- return 0;
- }
-}
-
-/* %%% cet_utf8_strdup %%%
- *
- * Checks and duplicates an UTF-8 string
- */
-char*
-cet_utf8_strdup(const char* str)
-{
- if (str) {
- return cet_utf8_strndup(str, strlen(str));
- } else {
- return nullptr;
- }
-}
-
-/* %%% cet_utf8_strndup %%%
- *
- * Checks and duplicates an UTF-8 string
- */
-char*
-cet_utf8_strndup(const char* str, const int maxlen)
-{
- if (str) {
- const char* cin = str;
- char* cout;
- int len = 0;
-
- char* res = cout = xstrdup(cin);
-
- while (*cin && (len < maxlen)) {
- int bytes, value;
- if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
- cout += cet_ucs4_to_utf8(cout, 6, value);
- len += 1;
- }
- cin += bytes;
- }
- *cout = '\0';
-
- if ((cin - str) != (cout - res)) {
- cout = xstrdup(res);
- xfree(res);
- res = cout;
- }
-
- return res;
- } else {
- return nullptr;
- }
-}
+++ /dev/null
-/*
-
- Character encoding transformation - basics header
-
- Copyright (C) 2005-2008 Olaf Klein, o.b.klein@gpsbabel.org
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-*/
-
-#ifndef CET_H
-#define CET_H
-
-#include <cstddef> // for size_t
-
-#define CET_ERROR 1
-#define CET_SUCCESS 0
-
-/* single char/value transmission */
-
-int cet_utf8_to_ucs4(const char* str, int* bytes, int* value);
-int cet_ucs4_to_utf8(char* dest, size_t dest_size, int value);
-
-/* UTF-8 string manipulation functions */
-
-unsigned int cet_utf8_strlen(const char* str);
-char* cet_utf8_strdup(const char* str);
-char* cet_utf8_strndup(const char* str, int maxlen);
-
-#endif
void list_codecs();
void list_timezones();
+QString grapheme_truncate(const QString& input, unsigned int count);
/*
* From parse.c
--- /dev/null
+/*
+
+ Character encoding transformation - basics
+
+ Copyright (C) 2005-2008 Olaf Klein, o.b.klein@gpsbabel.org
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#include <cstring> // for strlen
+
+#include "defs.h"
+#include "cet.h"
+
+/* ! ALL vec PARAMETERS HAVE TO BE A VALID POINTER TO A cet_cs_vec_t RECORD ! */
+
+/* =========================================================================== */
+/* %%% single character or value transmission %%% */
+/* --------------------------------------------------------------------------- */
+
+/* %%% cet_ucs4_to_utf8 %%%
+ *
+ * convert single UCS-4 value into UTF-8 sequence
+ *
+ * return values: >= 0: length of produced UTF-8 sequence
+ * < 0: -bytes more needed in target space
+ */
+
+int
+cet_ucs4_to_utf8(char* dest, size_t dest_size, int value)
+{
+ int result;
+ unsigned char trash[16];
+
+ unsigned char* c = (dest != nullptr) ? (unsigned char*) dest : trash;
+
+ if ((value & 0xffffff80) == 0) { /* <= 7 bits */
+ if (dest_size < 1) {
+ return (dest_size - 1);
+ }
+ *c++ = value;
+ result = 1;
+ } else if ((value & 0xfffff800) == 0) { /* <= 11 bits */
+ if (dest_size < 2) {
+ return (dest_size - 2);
+ }
+ *c++ = (0xc0 | (value >> 6));
+ *c++ = (0x80 | (value & 0x3f));
+ result = 2;
+
+ } else if ((value & 0xffff0000) == 0) { /* <= 16 bits */
+ if (dest_size < 3) {
+ return (dest_size - 3);
+ }
+ *c++ = (0xe0 | (value >> 12));
+ *c++ = (0x80 | ((value >> 6) & 0x3f));
+ *c++ = (0x80 | (value & 0x3f));
+ result = 3;
+ } else if ((value & 0xffe00000) == 0) { /* <= 21 bits */
+ if (dest_size < 4) {
+ return (dest_size - 4);
+ }
+ *c++ = (0xf0 | (value >> 18));
+ *c++ = (0x80 | ((value >> 12) & 0x3f));
+ *c++ = (0x80 | ((value >> 6) & 0x3f));
+ *c++ = (0x80 | (value & 0x3f));
+ result = 4;
+ } else if ((value & 0xfc000000) == 0) { /* <= 26 bits */
+ if (dest_size < 5) {
+ return (dest_size - 5);
+ }
+ *c++ = (0xf8 | (value >> 24));
+ *c++ = (0x80 | ((value >> 18) & 0x3f));
+ *c++ = (0x80 | ((value >> 12) & 0x3f));
+ *c++ = (0x80 | ((value >> 6) & 0x3f));
+ *c++ = (0x80 | (value & 0x3f));
+ result = 5;
+ } else if ((value & 0x80000000) == 0) { /* <= 31 bits */
+ if (dest_size < 6) {
+ return (dest_size - 6);
+ }
+ *c++ = (0xfc | (value >> 30));
+ *c++ = (0x80 | ((value >> 24) & 0x3f));
+ *c++ = (0x80 | ((value >> 18) & 0x3f));
+ *c++ = (0x80 | ((value >> 12) & 0x3f));
+ *c++ = (0x80 | ((value >> 6) & 0x3f));
+ *c++ = (0x80 | (value & 0x3f));
+ result = 6;
+ } else {
+ return 0; /* Value = -1 */
+ }
+ return result;
+}
+
+/* %%% cet_utf8_to_ucs4 %%%
+ *
+ * decode single UTF-8 sequence into UCS-4 value
+ *
+ * return values: 0 if success, otherwise 1
+ */
+int
+cet_utf8_to_ucs4(const char* str, int* bytes, int* value)
+{
+ auto* cp = (unsigned char*)str;
+
+ if (*cp < 0x80) {
+ if (bytes != nullptr) {
+ *bytes = 1;
+ }
+ if (value != nullptr) {
+ *value = *cp;
+ }
+ return CET_SUCCESS;
+ } else {
+ unsigned char bits = 0xc0;
+ unsigned char mask = 0xe0;
+
+ for (int len = 1; len <= 6; len++) { /* outer loop, test UTF-8 frame */
+ if ((*cp & mask) == bits) {
+ int i = len;
+ while (i-- > 0) {
+ cp++;
+ if ((*cp & 0xc0) != 0x80) {
+ break; /* invalid */
+ } else if (i == 0) { /* all valid */
+ const char* c = str; /* found valid sequence, now storing value */
+ int res = *c++ & (mask ^ 0xFF);
+ i = len;
+ while (i-- > 0) {
+ res = (res << 6) | (*c++ & 0x3f);
+ }
+
+ if (bytes != nullptr) {
+ *bytes = len + 1;
+ }
+ if (value != nullptr) {
+ *value = res;
+ }
+ return CET_SUCCESS;
+ }
+ }
+ }
+ bits = (bits >> 1) | 0x80;
+ mask = (mask >> 1) | 0x80;
+ }
+ }
+ if (bytes != nullptr) {
+ *bytes = 1;
+ }
+ if (value != nullptr) {
+ *value = *cp;
+ }
+ return CET_ERROR; /* not valid */
+}
+
+/* =========================================================================== */
+/* %%% UTF-8 string manipulation functions %%% */
+/* =========================================================================== */
+
+/* %%% cet_utf8_strlen %%%
+ *
+ * Returns the number of valid (visible) characters.
+ */
+unsigned int
+cet_utf8_strlen(const char* str)
+{
+ if (str) {
+ const char* cin = str;
+ int len = 0;
+
+ while (*cin) {
+ int bytes, value;
+ if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
+ len++;
+ }
+ cin += bytes;
+ }
+ return len;
+ } else {
+ return 0;
+ }
+}
+
+/* %%% cet_utf8_strdup %%%
+ *
+ * Checks and duplicates an UTF-8 string
+ */
+char*
+cet_utf8_strdup(const char* str)
+{
+ if (str) {
+ return cet_utf8_strndup(str, strlen(str));
+ } else {
+ return nullptr;
+ }
+}
+
+/* %%% cet_utf8_strndup %%%
+ *
+ * Checks and duplicates an UTF-8 string
+ */
+char*
+cet_utf8_strndup(const char* str, const int maxlen)
+{
+ if (str) {
+ const char* cin = str;
+ char* cout;
+ int len = 0;
+
+ char* res = cout = xstrdup(cin);
+
+ while (*cin && (len < maxlen)) {
+ int bytes, value;
+ if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
+ cout += cet_ucs4_to_utf8(cout, 6, value);
+ len += 1;
+ }
+ cin += bytes;
+ }
+ *cout = '\0';
+
+ if ((cin - str) != (cout - res)) {
+ cout = xstrdup(res);
+ xfree(res);
+ res = cout;
+ }
+
+ return res;
+ } else {
+ return nullptr;
+ }
+}
--- /dev/null
+/*
+
+ Character encoding transformation - basics header
+
+ Copyright (C) 2005-2008 Olaf Klein, o.b.klein@gpsbabel.org
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef CET_H
+#define CET_H
+
+#include <cstddef> // for size_t
+
+#define CET_ERROR 1
+#define CET_SUCCESS 0
+
+/* single char/value transmission */
+
+int cet_utf8_to_ucs4(const char* str, int* bytes, int* value);
+int cet_ucs4_to_utf8(char* dest, size_t dest_size, int value);
+
+/* UTF-8 string manipulation functions */
+
+unsigned int cet_utf8_strlen(const char* str);
+char* cet_utf8_strdup(const char* str);
+char* cet_utf8_strndup(const char* str, int maxlen);
+
+#endif
*/
#include <QByteArray> // for QByteArray
+#include <QChar> // for QChar, operator==, operator!=
#include <QString> // for QString
#include <QtGlobal> // for qPrintable
#include "gbfile.h"
#include "src/core/logging.h"
-#include "cet.h" // for cet_ucs4_to_utf8
-
#if __WIN32__
/* taken from minigzip.c (part of the zlib project) */
# include <fcntl.h>
return QString(ba);
}
-static char*
-gbfgetucs2str(gbfile* file)
+static QChar
+gbfgetutf16char(gbfile* file)
{
- int len = 0;
- char* result = file->buff;
-
- for (;;) {
- char buff[8];
-
int c0 = gbfgetc(file);
- if ((c0 == EOF) && (len == 0)) {
- return nullptr;
+ if (c0 == EOF) {
+ return QChar();
}
+
int c1 = gbfgetc(file);
- if ((c1 == EOF) && (len == 0)) {
- return nullptr;
+
+ if (c1 == EOF) {
+ fatal("%s: Incomplete unicode (UTF-16%cE) character at EOF!\n",
+ file->module,
+ file->big_endian ? 'B' : 'L');
}
+ unsigned char cell;
+ unsigned char row;
if (file->big_endian) {
- c0 = c1 | (c0 << 8);
+ cell = static_cast<unsigned char>(c1);
+ row = static_cast<unsigned char>(c0);
} else {
- c0 = c0 | (c1 << 8);
+ cell = static_cast<unsigned char>(c0);
+ row = static_cast<unsigned char>(c1);
}
+ return QChar(cell, row);
+}
- if (c0 == '\r') {
+/*
+ * Reads a string from utf16 encoded file.
+ * Terminates at EOF or a line ending(\r\n, \n).
+ * Line endings are not included in the returned string.
+ * Returns a nullptr if at EOF, otherwise it returns a pointer
+ * to a possibly empty null terminated utf-8 character array.
+ * Fatal errors can occur if:
+ * i) the file ends with either an incomplete utf-16 character, or
+ * ii) the file ends with an incomplete surrogate pair, or
+ * iii) a high surrogate is not followed by a low surrogate, or
+ * iv) a low surrogate isn't preceded by a high surrogate.
+ */
+static char*
+gbfgetutf16str(gbfile* file)
+{
+ int len = 0;
+ char* result = file->buff;
- c0 = gbfgetc(file);
- if ((c0 == EOF) && (len == 0)) {
- return nullptr;
- }
- c1 = gbfgetc(file);
- if ((c1 == EOF) && (len == 0)) {
+ for (;;) {
+ QChar qch = gbfgetutf16char(file);
+ if (qch.isNull()) {
+ if (len == 0) {
return nullptr;
- }
-
- if (file->big_endian) {
- c0 = c1 | (c0 << 8);
} else {
- c0 = c0 | (c1 << 8);
+ break;
}
+ }
- if (c0 != '\n')
- fatal("%s: Invalid unicode (UCS-2/%s endian) line break!\n",
+ if (qch == u'\r') {
+ QChar qch2 = gbfgetutf16char(file);
+ if (qch2 != u'\n') { // including qch2.isNull()
+ // Putting back two chars may not be supported, e.g. with gzapi_ungetc.
+ fatal("%s: Invalid unicode (UTF-16%cE) line break!\n",
file->module,
- file->big_endian ? "Big" : "Little");
+ file->big_endian ? 'B' : 'L');
+ }
+ break;
+ } else if (qch == u'\n') {
break;
}
- int clen = cet_ucs4_to_utf8(buff, sizeof(buff), c0);
- if (clen < 1) {
- Warning() << "Malformed UCS character" << c0 << "found.";
- return nullptr;
+ if (qch.isLowSurrogate()) {
+ fatal("%s: Leading unicode (UTF-16%cE) low surrogate!\n",
+ file->module,
+ file->big_endian ? 'B' : 'L');
}
+ QString str(qch);
+ if (qch.isHighSurrogate()) {
+ QChar qch2 = gbfgetutf16char(file);
+ if (!qch2.isLowSurrogate()) { // including qch2.isNull()
+ fatal("%s: Missing unicode (UTF-16%cE) low surrogate!\n",
+ file->module,
+ file->big_endian ? 'B' : 'L');
+ }
+ str.append(qch2);
+ }
+
+ QByteArray ba = str.toUtf8();
+ int clen = ba.size();
+
if (len+clen >= file->buffsz) {
file->buffsz += 64;
result = file->buff = (char*) xrealloc(file->buff, file->buffsz + 1);
}
- memcpy(&result[len], buff, clen);
+ memcpy(&result[len], ba.constData(), clen);
len += clen;
}
result[len] = '\0'; // terminate resulting string
char* result = file->buff;
if (file->unicode) {
- return gbfgetucs2str(file);
+ return gbfgetutf16str(file);
}
for (;;) {
if (cx == 0xFEFF) {
file->unicode = 1;
file->big_endian = 0;
- return gbfgetucs2str(file);
+ return gbfgetutf16str(file);
} else if (cx == 0xFFFE) {
file->unicode = 1;
file->big_endian = 1;
- return gbfgetucs2str(file);
+ return gbfgetutf16str(file);
} else {
gbfungetc(c1, file);
}
*/
-#include <cctype> // for isspace, toupper, isdigit
-#include <cstdio> // for sprintf, size_t
-#include <cstring> // for strlen, memmove, strchr, strcpy, strncmp, strcat, strncpy
+#include <cctype> // for isspace, toupper, isdigit
+#include <cstdio> // for sprintf, size_t
+#include <cstring> // for strlen, memmove, strchr, strcpy, strncmp, strcat, strncpy
-#include <QList> // for QList
-#include <QString> // for QString
-#include <QtGlobal> // for foreach
+#include <QByteArray> // for QByteArray
+#include <QChar> // for QChar, QChar::ReplacementCharacter
+#include <QList> // for QList
+#include <QString> // for QString
+#include <QtGlobal> // for foreach
#include "defs.h"
-#include "cet.h" // for cet_utf8_strdup, cet_utf8_strlen, cet_utf8_strndup
#define MYNAME "mkshort"
auto* hdl = (mkshort_handle_imp*) h;
if (is_utf8) {
- ostring = cet_utf8_strdup(istring); /* clean UTF-8 string */
+ /* clean UTF-8 string */
+ QString result = QString::fromUtf8(istring);
+ // QString::fromUtf8() doesn't quite promise to use QChar::ReplacementCharacter,
+ // but if it did toss them.
+ result.remove(QChar::ReplacementCharacter);
+ ostring = xstrdup(result.toUtf8().constData());
} else {
ostring = xstrdup(istring);
}
*/
if (is_utf8) {
/* ToDo: Keep trailing numeric data as described above! */
- if (cet_utf8_strlen(ostring) > hdl->target_len) {
- char* tmp = cet_utf8_strndup(ostring, hdl->target_len);
- xfree(ostring);
- ostring = tmp;
- }
+ QString result = grapheme_truncate(QString::fromUtf8(ostring), hdl->target_len);
+ xfree(ostring);
+ ostring = xstrdup(result.toUtf8().constData());
} else if ((/*i = */strlen(ostring)) > hdl->target_len) {
char* dp = &ostring[hdl->target_len] - nlen;
if (dp < ostring) {
--- /dev/null
+description,lat,lon
+Würzburg,49.7913,9.9534
+Margetshöchheim,49.8351,9.8641
+ÿEingen,48.9464,8.6694
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<gpx version="1.0" creator="GPSBabel - https://www.gpsbabel.org" xmlns="http://www.topografix.com/GPX/1/0">
+ <time>1970-01-01T00:00:00Z</time>
+ <bounds minlat="48.946400000" minlon="8.669400000" maxlat="49.835100000" maxlon="9.953400000"/>
+ <wpt lat="49.791300000" lon="9.953400000">
+ <name>Würzbr</name>
+ <cmt>Würzburg</cmt>
+ <desc>Würzburg</desc>
+ </wpt>
+ <wpt lat="49.835100000" lon="9.864100000">
+ <name>Margts</name>
+ <cmt>Margetshöchheim</cmt>
+ <desc>Margetshöchheim</desc>
+ </wpt>
+ <wpt lat="48.946400000" lon="8.669400000">
+ <name>Eingen</name>
+ <cmt>�Eingen</cmt>
+ <desc>�Eingen</desc>
+ </wpt>
+</gpx>
--- /dev/null
+No,Latitude,Longitude,Name,Description,Symbol\r
+1,35.972033,-87.134700,"GCEBB","Mountain Bike Heaven by susy1313","Waypoint"\r
+2,36.090683,-86.679550,"GC1A37","The 😁 Troll by a182pilot & Family","Waypoint"\r
+3,35.996267,-86.620117,"GC1C2B","Dive Bomber by JoGPS & family","Waypoint"\r
+4,36.038483,-86.648617,"GC25A9","FOSTER by JoGPS & Family","Waypoint"\r
+5,36.112183,-86.741767,"GC2723","Logan Lighthouse by JoGps & Family","Waypoint"\r
+6,36.064083,-86.790517,"GC2B71","Ganier Cache by Susy1313","Waypoint"\r
+7,36.087767,-86.809733,"GC309F","Shy's Hill by FireFighterEng33","Waypoint"\r
+8,36.057500,-86.892000,"GC317A","GittyUp by JoGPS / Warner Parks","Waypoint"\r
+9,36.082800,-86.867283,"GC317D","Inlighting by JoGPS / Warner Parks","Waypoint"\r
--- /dev/null
+# these are here to test gbfile utf16 reads.
+
+# Assumes nmea reader is still using gbfgetstr.
+# These are handcrafted input files, they may not be legal nmea files.
+# with \n line endings
+gpsbabel -i nmea -f ${REFERENCE}/track/nmea_utf16 -o gpx -F ${TMPDIR}/nmea_utf16.gpx
+compare ${REFERENCE}/track/nmea.gpx ${TMPDIR}/nmea_utf16.gpx
+# with \r\n line endings
+gpsbabel -i nmea -f ${REFERENCE}/track/nmea_utf16_dos -o gpx -F ${TMPDIR}/nmea_utf16_dos.gpx
+compare ${REFERENCE}/track/nmea.gpx ${TMPDIR}/nmea_utf16_dos.gpx
+# Assumes pcx reader is still using gbfgetstr.
+# These are handcrafted input files, they may not be legal pcx files.
+# with a unicode character from the supplemental plane encoded in utf16le.
+gpsbabel -i pcx -f ${REFERENCE}/testsupplementalplane.pcx -o unicsv -F ${TMPDIR}/testsupplementalplane.csv
+compare ${REFERENCE}/testsupplementalplane.csv ${TMPDIR}/testsupplementalplane.csv
--- /dev/null
+# test mkshort utf8 to see it breaks on grapheme boundaries
+# and tosses invalid sequences.
+# note grapheme.csv uses the combining diacritical mark U+0308.
+# note grapheme.csv has an invalid byte 0xff.
+gpsbabel -s -i unicsv -f ${REFERENCE}/grapheme.csv -o gpx,snlen=6 -F ${TMPDIR}/grapheme.gpx
+compare ${REFERENCE}/grapheme.gpx ${TMPDIR}/grapheme.gpx
#include <QList> // for QList
#include <QScopedPointer> // for QScopedPointer
#include <QString> // for QString
+#include <QTextBoundaryFinder> // for QTextBoundaryFinder, QTextBoundaryFinder::Grapheme
#include <QTextCodec> // for QTextCodec
#include <QTextStream> // for operator<<, QTextStream, qSetFieldWidth, endl, QTextStream::AlignLeft
#include <QXmlStreamAttribute> // for QXmlStreamAttribute
}
}
+/*
+ * Truncate a string to at most count graphemes.
+ *
+ * The cut is made on a grapheme boundary as reported by
+ * QTextBoundaryFinder, so a base character is never separated from
+ * its combining marks and a surrogate pair is never split.
+ *
+ * input: the string to shorten.
+ * count: the maximum number of graphemes to keep.
+ * Returns input unchanged if it holds count graphemes or fewer,
+ * otherwise the leading count graphemes of input.
+ */
+QString grapheme_truncate(const QString& input, unsigned int count)
+{
+  QTextBoundaryFinder boundary(QTextBoundaryFinder::Grapheme, input);
+  boundary.toStart();
+  // Step over one grapheme per iteration and stop as soon as count of
+  // them have been passed; there is no need to scan the rest of the
+  // string or to remember every boundary along the way.
+  for (unsigned int grapheme_cnt = 0; grapheme_cnt < count; ++grapheme_cnt) {
+    if (boundary.toNextBoundary() < 0) {
+      // Ran out of boundaries first: input has fewer than count
+      // graphemes, so there is nothing to cut.
+      return input;
+    }
+  }
+  // The finder now sits just past the count'th grapheme (or at the
+  // start when count is zero).
+  QString output = input.left(boundary.position());
+  if constexpr(false) {
+    qDebug() << input << "->" << output << ", limit:" << count <<
+             ", input QChars:" << input.size() <<
+             ", output QChars:" << output.size();
+  }
+  return output;
+}